Preprocessing
The first step is to extract all the text from each document and preprocess it into a format suitable for analysis.
Tech stack
The packages used for this level are:
- pandas: To read and manipulate dataframes
- fitz: To extract text from documents
- nltk: To remove stopwords and extract important words
- spacy: To extract dates from the documents
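All of these are available on PyPI; fitz is provided by the PyMuPDF package, and the date handling further down also relies on python-dateutil. A typical install (exact versions left to the reader) would be:

pip install pandas PyMuPDF nltk spacy python-dateutil
python -m spacy download en_core_web_sm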
Processing the retention schedule
The retention schedule is given as an Excel workbook with many empty or redundant rows and columns. It is first processed into a clean CSV file.
import pandas as pd

# Split every sheet of the retention schedule workbook into its own CSV file.
# file_path should point to the original Excel workbook.
xls = pd.read_excel(file_path, sheet_name=None)
csv_files = {sheet_name: sheet.to_csv(index=False) for sheet_name, sheet in xls.items()}
for sheet_name, csv_data in csv_files.items():
    with open(f"{sheet_name}.csv", "w") as csv_file:
        csv_file.write(csv_data)

def preprocess_csv(file_path):
    df = pd.read_csv(file_path)
    # Drop the leading junk rows, then any rows/columns that are entirely empty
    df = df.drop([0, 1, 2]).dropna(how='all', axis=1).dropna(how='all', axis=0).reset_index(drop=True)
    # The first remaining row holds the real column names
    new_header = df.iloc[0]
    df = df.drop(0)
    df.columns = new_header
    # Keep only rows that have a value in the 'Ref' column
    df = df.dropna(subset=['Ref'], how='any').reset_index(drop=True)
    df.to_csv('processed_Simplified.csv', index=False, header=True)
    return None

preprocess_csv("Simplified.csv")
Extract text
Next, each document is read and its text contents are stored in a dictionary.
import fitz
import os

def extract_text_from_pdf(pdf_file, folder_path):
    # Concatenate the text of every page in the PDF
    text = ""
    with fitz.open(os.path.join(folder_path, pdf_file)) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            text += page.get_text("text")
    return text

folder_path = './repository'

# Collect the PDF files in the repository folder (including subfolders)
pdf_files = []
for root, dirs, files in os.walk(folder_path):
    for file in files:
        if file.lower().endswith(".pdf"):
            # Store the path relative to folder_path so files in subfolders can be opened too
            pdf_files.append(os.path.relpath(os.path.join(root, file), folder_path))

# Map each file to its extracted text
file_contents = {}
for pdf_file in pdf_files:
    file_contents[pdf_file] = {"file_name": pdf_file, "text": extract_text_from_pdf(pdf_file, folder_path)}
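A small optional check helps confirm that every file actually yielded some text (a scanned PDF without a text layer would show zero characters here):

for name, entry in file_contents.items():
    print(f"{name}: {len(entry['text'])} characters extracted")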
Process text
Now the text content of each document is processed into a format suitable for analysis.
Remove hyphens and replace ligatures
First, line-break hyphens are removed and ligatures ("ﬆ", "Æ", etc.) are replaced by standard characters.
from typing import List

def replace_ligatures(text: str) -> str:
    # Map common ligature characters back to their plain-letter equivalents
    ligatures = {
        "ﬀ": "ff",
        "ﬁ": "fi",
        "ﬂ": "fl",
        "ﬃ": "ffi",
        "ﬄ": "ffl",
        "ﬅ": "ft",
        "ﬆ": "st",
        "Ꜳ": "AA",
        "Æ": "AE",
        "ꜳ": "aa",
    }
    for search, replace in ligatures.items():
        text = text.replace(search, replace)
    return text

def remove_hyphens(text: str) -> str:
    """
    Join words that were split across lines with a trailing hyphen.
    This fails for:
    * Natural dashes: well-known, self-replication, use-cases, non-semantic,
      Post-processing, Window-wise, viewpoint-dependent
    * Trailing math operands: 2 - 4
    * Names: Lopez-Ferreras, VGG-19, CIFAR-100
    """
    lines = [line.rstrip() for line in text.split("\n")]

    # Find every line that ends with a hyphen
    line_numbers = []
    for line_no, line in enumerate(lines[:-1]):
        if line.endswith("-"):
            line_numbers.append(line_no)
    # Merge each split word back together
    for line_no in line_numbers:
        lines = dehyphenate(lines, line_no)

    return "\n".join(lines)

def dehyphenate(lines: List[str], line_no: int) -> List[str]:
    # Move the first word of the next line onto the hyphenated line
    next_line = lines[line_no + 1]
    word_suffix = next_line.split(" ")[0]
    lines[line_no] = lines[line_no][:-1] + word_suffix
    lines[line_no + 1] = lines[line_no + 1][len(word_suffix):]
    return lines

for file in file_contents.keys():
    file_contents[file]["text"] = remove_hyphens(replace_ligatures(file_contents[file]["text"]))
Remove stopwords
Stopwords are words like articles (a, an, the) and prepositions (of, in, at) which carry little semantic meaning on their own. Removing them and retaining only the meaningful words results in better analysis.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

def extract_important_words(text):
    words = word_tokenize(text)
    # Drop English stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # Keep only nouns, verbs and adjectives
    tagged_words = pos_tag(filtered_words)
    important_words = [word for word, tag in tagged_words if tag in ('NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS')]
    return ' '.join(important_words)

for file in file_contents.keys():
    file_contents[file]["imp_words"] = extract_important_words(file_contents[file]["text"])
Extract date
Next, we extract each document's creation date from its text contents.
import spacy
from dateutil import parser

nlp = spacy.load("en_core_web_sm")

def extract_all_dates(text):
    # Use spaCy's named entity recognizer to find DATE entities
    doc = nlp(text)
    dates = [ent.text for ent in doc.ents if ent.label_ == 'DATE']
    return dates

def convert(dates):
    formatted_dates = []
    for date in dates:
        try:
            parsed_date = parser.parse(date, fuzzy=True)
            formatted_date = parsed_date.strftime('%Y-%m-%d')
            # Retain only dates that fall within a plausible range
            if "2010-01-01" <= formatted_date <= "2024-04-10":
                formatted_dates.append(formatted_date)
        except (ValueError, OverflowError):
            # If the date cannot be parsed, it is invalid; move on to the next one
            continue
    return formatted_dates

# Take the first valid date found in each document as its creation date
dates = {}
for file in file_contents.keys():
    text = file_contents[file]["text"]
    all_dates = convert(extract_all_dates(text))
    if all_dates:
        dates[file] = all_dates[0]
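Finally, one possible way (an assumption, not shown in the code above) to keep the preprocessed fields together for the analysis step is to collect them into a single dataframe; the file name "preprocessed_documents.csv" below is illustrative.

import pandas as pd

records = []
for file, entry in file_contents.items():
    records.append({
        "file_name": entry["file_name"],
        "imp_words": entry["imp_words"],
        "date": dates.get(file),   # None when no valid date was found
    })
preprocessed_df = pd.DataFrame(records)
preprocessed_df.to_csv("preprocessed_documents.csv", index=False)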